# Always print this out before your assignment
sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods
## [7] base
##
## other attached packages:
## [1] knitr_1.36
##
## loaded via a namespace (and not attached):
## [1] digest_0.6.28 R6_2.5.1 jsonlite_1.7.2 magrittr_2.0.1
## [5] evaluate_0.14 rlang_0.4.11 stringi_1.7.5 jquerylib_0.1.4
## [9] bslib_0.3.1 rmarkdown_2.10 tools_4.1.1 stringr_1.4.0
## [13] xfun_0.25 yaml_2.2.1 fastmap_1.1.0 compiler_4.1.1
## [17] htmltools_0.5.2 sass_0.4.0
getwd()
## [1] "/Users/angpham/Desktop/CPSC_Courses/MGSC310/final_project"
# Load all your libraries in this chunk
library('tidyverse')
library('dplyr')
library('ggplot2')
library('forcats')
library('rsample')
library('glmnet')
library('glmnetUtils')
# NOTE: Do not run install.packages() inside a code chunk. Install them in the console outside of a code chunk.
tracks <- read.csv(here::here("spotify_dataset", "spotify_tracks.csv"))
The follwing link details each of the features in the ‘tracks’ dataset: https://developer.spotify.com/documentation/web-api/reference/#/operations/get-several-tracks)
tracks %>% glimpse()
## Rows: 101,939
## Columns: 32
## $ X <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, …
## $ acousticness <dbl> 0.294, 0.863, 0.750, 0.763, 0.770, 0.971, …
## $ album_id <chr> "0D3QufeCudpQANOR7luqdr", "1bcqsH5UyTBzmh9…
## $ analysis_url <chr> "https://api.spotify.com/v1/audio-analysis…
## $ artists_id <chr> "['3mxJuHRn2ZWD5OofvJtDZY']", "['4xWMewm6C…
## $ available_markets <chr> "['AD', 'AE', 'AR', 'AT', 'AU', 'BE', 'BG'…
## $ country <chr> "BE", "BE", "BE", "BE", "BE", "BE", "BE", …
## $ danceability <dbl> 0.698, 0.719, 0.466, 0.719, 0.460, 0.367, …
## $ disc_number <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ duration_ms <dbl> 235584, 656960, 492840, 316578, 558880, 18…
## $ energy <dbl> 0.6060, 0.3080, 0.9310, 0.1260, 0.9420, 0.…
## $ href <chr> "https://api.spotify.com/v1/tracks/5qljLQu…
## $ id <chr> "5qljLQuKnNJf4F4vfxQB0V", "3VAX2MJdmdqARLS…
## $ instrumentalness <dbl> 0.00000269, 0.00000000, 0.00000000, 0.0000…
## $ key <dbl> 10, 6, 4, 3, 7, 11, 10, 3, 5, 5, 6, 0, 4, …
## $ liveness <dbl> 0.1510, 0.2530, 0.9380, 0.1130, 0.9170, 0.…
## $ loudness <dbl> -7.447, -10.340, -13.605, -20.254, -13.749…
## $ lyrics <chr> "\n\nPerhaps I am bound to be restless\nAl…
## $ mode <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, …
## $ name <chr> "Blood", "The Ugly Duckling", "Jimmy Launc…
## $ playlist <chr> "Hipsteribrunssi", "Animal Stories", "Best…
## $ popularity <dbl> 28, 31, 31, 14, 32, 45, 0, 26, 17, 29, 47,…
## $ preview_url <chr> "https://p.scdn.co/mp3-preview/1b05a902da3…
## $ speechiness <dbl> 0.0262, 0.9220, 0.9440, 0.9380, 0.9430, 0.…
## $ tempo <dbl> 115.018, 115.075, 79.565, 112.822, 81.260,…
## $ time_signature <dbl> 4, 3, 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, …
## $ track_href <chr> "https://api.spotify.com/v1/tracks/5qljLQu…
## $ track_name_prev <chr> "track_14", "track_3", "track_4", "track_9…
## $ track_number <dbl> 1, 3, 4, 1, 2, 8, 2, 11, 6, 6, 1, 3, 12, 5…
## $ uri <chr> "spotify:track:5qljLQuKnNJf4F4vfxQB0V", "s…
## $ valence <dbl> 0.6220, 0.5890, 0.0850, 0.5330, 0.0906, 0.…
## $ type <chr> "track", "track", "track", "track", "track…
Below is a summary of the relevant features that will be used for this question. This includes genral information about a track, popularity, and its audio features:
- id: ID of th track, as identified by Spotify
- name: Name of track
- href: Link to Spotify API for complete information about a track
- uri: Unique identifier to access track on Spotify
- artists_id: List of artists IDs for a track
- album_id: Album ID of the album the track is a part of
- duration_ms: Length of track
- popularity: Popularity score for a track based on Spotify algorithms (Spotify indicates that it is largely based on number of plays.)
- acousticness: Confidence measure from 0.0 to 1.0 of whether the track is acoustic
- danceability: How suitable a track is for dancing based on a combination of musical elements including tempo, rhythm stability, beat strength, and overall regularity
- energy: Measure from 0.0 to 1.0 and represents a perceptual measure of intensity and activity
- instrumentalness: Predicts whether a track contains no vocals
- key: Key the track is in. Integers map to pitches using standard Pitch Class notation
- liveness: Detects the presence of an audience in the recording
- loudness: Quality of a sound that is the primary psychological correlate of physical strength (amplitude), measured in decibels (dB)
- mode: Modality (major or minor) of a track
- speechiness: Presence of spoken words in a track
- tempo: Speed or pace of a given piece in beats per minute (BPM)
- time_signature: Notational convention to specify how many beats are in each bar (or measure)
- valence: Musical positiveness conveyed by a track
tracks_clean <- na.omit(tracks) # Omit any tracks with an na value in any column
tracks_clean <- tracks[, c("id", "name", "href", "uri", "artists_id", "album_id", "duration_ms", "popularity", "acousticness", "danceability", "energy", "instrumentalness", "key", "liveness", "loudness", "mode", "speechiness", "tempo", "time_signature", "valence")]
tracks_clean %>% glimpse()
## Rows: 101,939
## Columns: 20
## $ id <chr> "5qljLQuKnNJf4F4vfxQB0V", "3VAX2MJdmdqARLSU…
## $ name <chr> "Blood", "The Ugly Duckling", "Jimmy Launch…
## $ href <chr> "https://api.spotify.com/v1/tracks/5qljLQuK…
## $ uri <chr> "spotify:track:5qljLQuKnNJf4F4vfxQB0V", "sp…
## $ artists_id <chr> "['3mxJuHRn2ZWD5OofvJtDZY']", "['4xWMewm6CY…
## $ album_id <chr> "0D3QufeCudpQANOR7luqdr", "1bcqsH5UyTBzmh9Y…
## $ duration_ms <dbl> 235584, 656960, 492840, 316578, 558880, 183…
## $ popularity <dbl> 28, 31, 31, 14, 32, 45, 0, 26, 17, 29, 47, …
## $ acousticness <dbl> 0.294, 0.863, 0.750, 0.763, 0.770, 0.971, 0…
## $ danceability <dbl> 0.698, 0.719, 0.466, 0.719, 0.460, 0.367, 0…
## $ energy <dbl> 0.6060, 0.3080, 0.9310, 0.1260, 0.9420, 0.3…
## $ instrumentalness <dbl> 0.00000269, 0.00000000, 0.00000000, 0.00000…
## $ key <dbl> 10, 6, 4, 3, 7, 11, 10, 3, 5, 5, 6, 0, 4, 5…
## $ liveness <dbl> 0.1510, 0.2530, 0.9380, 0.1130, 0.9170, 0.6…
## $ loudness <dbl> -7.447, -10.340, -13.605, -20.254, -13.749,…
## $ mode <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1…
## $ speechiness <dbl> 0.0262, 0.9220, 0.9440, 0.9380, 0.9430, 0.0…
## $ tempo <dbl> 115.018, 115.075, 79.565, 112.822, 81.260, …
## $ time_signature <dbl> 4, 3, 4, 3, 4, 4, 3, 4, 4, 4, 4, 4, 3, 4, 4…
## $ valence <dbl> 0.6220, 0.5890, 0.0850, 0.5330, 0.0906, 0.1…
summary(tracks_clean)
## id name href
## Length:101939 Length:101939 Length:101939
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## uri artists_id album_id
## Length:101939 Length:101939 Length:101939
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## duration_ms popularity acousticness danceability
## Min. : 1155 Min. : 0.00 Min. :0.0000 Min. :0.000
## 1st Qu.: 184000 1st Qu.:29.00 1st Qu.:0.0407 1st Qu.:0.480
## Median : 216893 Median :41.00 Median :0.2380 Median :0.610
## Mean : 246771 Mean :39.78 Mean :0.3521 Mean :0.586
## 3rd Qu.: 261055 3rd Qu.:52.00 3rd Qu.:0.6450 3rd Qu.:0.714
## Max. :5505831 Max. :97.00 Max. :0.9960 Max. :0.989
## energy instrumentalness key
## Min. :0.0000 Min. :0.0000000 Min. : 0.000
## 1st Qu.:0.4110 1st Qu.:0.0000000 1st Qu.: 2.000
## Median :0.6290 Median :0.0000375 Median : 5.000
## Mean :0.5865 Mean :0.1487759 Mean : 5.271
## 3rd Qu.:0.7980 3rd Qu.:0.0344000 3rd Qu.: 8.000
## Max. :1.0000 Max. :1.0000000 Max. :11.000
## liveness loudness mode speechiness
## Min. :0.0000 Min. :-60.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0956 1st Qu.:-11.149 1st Qu.:0.0000 1st Qu.:0.0364
## Median :0.1240 Median : -7.599 Median :1.0000 Median :0.0506
## Mean :0.1976 Mean : -9.463 Mean :0.6182 Mean :0.1288
## 3rd Qu.:0.2410 3rd Qu.: -5.509 3rd Qu.:1.0000 3rd Qu.:0.1040
## Max. :0.9990 Max. : 2.719 Max. :1.0000 Max. :0.9690
## tempo time_signature valence
## Min. : 0.00 Min. :0.000 Min. :0.0000
## 1st Qu.: 95.97 1st Qu.:4.000 1st Qu.:0.2710
## Median :118.07 Median :4.000 Median :0.4770
## Mean :118.36 Mean :3.876 Mean :0.4828
## 3rd Qu.:136.04 3rd Qu.:4.000 3rd Qu.:0.6930
## Max. :244.03 Max. :5.000 Max. :0.9930
Below is a histogram showing the spread of the ‘popularity’ variable. From the plot below, we can see that the ‘popularity’ variable is normally distributed from 0 - 100, with 100 indicating most popular. The ‘popularity’ variable is not ordered by rank. Spotify indicates it largely based on number of plays, but includes other factors as well.
ggplot(tracks_clean, aes(popularity)) + geom_histogram(binwidth = 5) + theme_minimal()
The following plots show relationships between the popularity scores and each audio feature. The plots are not linear; however, some plots, like danceability and energy clearly show that higher values in those audio features indicate more popular music, as indicated by darker points in the top right hand corner.
ggplot(tracks_clean, aes(acousticness, popularity)) + geom_point(alpha = 0.05, color = "#5F4690") + geom_smooth() + theme_minimal()
ggplot(tracks_clean, aes(danceability, popularity)) + geom_point(alpha = 0.05, color = "#1D6996") + geom_smooth() + theme_minimal()
ggplot(tracks_clean, aes(energy, popularity)) + geom_point(alpha = 0.05, color = "#38A6A5") + geom_smooth() + theme_minimal()
ggplot(tracks_clean, aes(instrumentalness, popularity)) + geom_point(alpha = 0.05, color = "#0F8554") + geom_smooth() + theme_minimal()
ggplot(tracks_clean, aes(key, popularity)) + geom_point(alpha = 0.05, color = "#73AF48") + geom_smooth() + theme_minimal()
ggplot(tracks_clean, aes(liveness, popularity)) + geom_point(alpha = 0.05, color = "#EDAD08") + geom_smooth() + theme_minimal()
ggplot(tracks_clean, aes(loudness, popularity)) + geom_point(alpha = 0.05, color = "#E17C05") + geom_smooth() + theme_minimal()
ggplot(tracks_clean, aes(mode, popularity)) + geom_point(alpha = 0.05, color = "#6F4070") + geom_smooth() + theme_minimal()
ggplot(tracks_clean, aes(speechiness, popularity)) + geom_point(alpha = 0.05, color = "#94346E") + geom_smooth() + theme_minimal()
ggplot(tracks_clean, aes(tempo, popularity)) + geom_point(alpha = 0.05, color = "#CC503E") + geom_smooth() + theme_minimal()
ggplot(tracks_clean, aes(time_signature, popularity)) + geom_point(alpha = 0.05, color = "#994E95") + geom_smooth() +theme_minimal()
ggplot(tracks_clean, aes(valence, popularity)) + geom_point(alpha = 0.05, color = "#1DB954FF") + geom_smooth() +theme_minimal()
# Split data
tracks_split <- initial_split(tracks_clean, prop = 0.70) # 70%-30% train-test split
tracks_train <- training(tracks_split)
tracks_test <- testing(tracks_split)
# Create model
lr_simple <- lm(popularity ~ acousticness + danceability + energy + instrumentalness + key + liveness + loudness + mode +
speechiness + tempo + time_signature + valence,
data = tracks_train)
# Summarize model
summary(lr_simple)
##
## Call:
## lm(formula = popularity ~ acousticness + danceability + energy +
## instrumentalness + key + liveness + loudness + mode + speechiness +
## tempo + time_signature + valence, data = tracks_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.309 -10.897 -0.133 10.972 62.215
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39.019192 0.720172 54.180 < 0.0000000000000002
## acousticness -0.616999 0.259494 -2.378 0.01742
## danceability 10.923183 0.430506 25.373 < 0.0000000000000002
## energy -1.862048 0.461447 -4.035 0.00005460
## instrumentalness -0.709679 0.230822 -3.075 0.00211
## key -0.027454 0.016596 -1.654 0.09807
## liveness 1.701168 0.360877 4.714 0.00000243
## loudness 0.449215 0.017860 25.151 < 0.0000000000000002
## mode 0.007194 0.122794 0.059 0.95329
## speechiness -23.611448 0.333765 -70.743 < 0.0000000000000002
## tempo 0.002754 0.002015 1.367 0.17171
## time_signature 1.322444 0.119332 11.082 < 0.0000000000000002
## valence -5.318778 0.287457 -18.503 < 0.0000000000000002
##
## (Intercept) ***
## acousticness *
## danceability ***
## energy ***
## instrumentalness **
## key .
## liveness ***
## loudness ***
## mode
## speechiness ***
## tempo
## time_signature ***
## valence ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.63 on 71344 degrees of freedom
## Multiple R-squared: 0.139, Adjusted R-squared: 0.1388
## F-statistic: 959.7 on 12 and 71344 DF, p-value: < 0.00000000000000022
preds_train_lr_simple <- predict(lr_simple, newdata = tracks_train)
preds_test_lr_simple <- predict(lr_simple, newdata = tracks_test)
results <- data.frame(
`preds` = c(preds_test_lr_simple, preds_train_lr_simple),
`true` = c(tracks_test$popularity, tracks_train$popularity),
`type` = c(rep("Test", nrow(tracks_test)), rep("Train", nrow(tracks_train))))
results
ggplot(results, aes(true, preds)) + geom_point(alpha = 0.05) +
facet_wrap(. ~ type) +
theme_minimal() +
labs(x = "True Track's Popularity",
y = "Predicted Tracks's Popularity")
get_rmse <- function(true_values, predictions) {
sqrt(mean((true_values - predictions) ^ 2))
}
get_mae <- function(true_values, predictions){
mean(abs(true_values - predictions))
}
get_rmse(tracks_train$popularity, preds_train_lr_simple)
## [1] 15.62576
get_rmse(tracks_test$popularity, preds_test_lr_simple)
## [1] 15.57006
get_mae(tracks_train$popularity, preds_train_lr_simple)
## [1] 12.65871
get_mae(tracks_test$popularity, preds_test_lr_simple)
## [1] 12.61185
ridge_fit <- cv.glmnet(popularity ~ acousticness + danceability + energy + instrumentalness + key + liveness
+ loudness + mode + speechiness + tempo + time_signature + valence,
data = tracks_train, alpha = 0)
print(ridge_fit)
## Call:
## cv.glmnet.formula(formula = popularity ~ acousticness + danceability +
## energy + instrumentalness + key + liveness + loudness + mode +
## speechiness + tempo + time_signature + valence, data = tracks_train,
## alpha = 0)
##
## Model fitting options:
## Sparse model matrix: FALSE
## Use model.frame: FALSE
## Number of crossvalidation folds: 10
## Alpha: 0
## Deviance-minimizing lambda: 0.5189662 (+1 SE): 3.039596
plot(ridge_fit)
coef(ridge_fit, s = "lambda.min")
## 13 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 38.570159373
## acousticness -0.662433093
## danceability 10.189906148
## energy -1.276831247
## instrumentalness -0.736573538
## key -0.027030431
## liveness 1.249890299
## loudness 0.422609024
## mode 0.008885713
## speechiness -22.809851155
## tempo 0.002753637
## time_signature 1.349099133
## valence -4.941543881
lasso_fit <- cv.glmnet(popularity ~ acousticness + danceability + energy + instrumentalness + key + liveness
+ loudness + mode + speechiness + tempo + time_signature + valence,
data = tracks_train, alpha = 1)
print(lasso_fit)
## Call:
## cv.glmnet.formula(formula = popularity ~ acousticness + danceability +
## energy + instrumentalness + key + liveness + loudness + mode +
## speechiness + tempo + time_signature + valence, data = tracks_train,
## alpha = 1)
##
## Model fitting options:
## Sparse model matrix: FALSE
## Use model.frame: FALSE
## Number of crossvalidation folds: 10
## Alpha: 1
## Deviance-minimizing lambda: 0.008457856 (+1 SE): 0.4209495
plot(lasso_fit)
coef(lasso_fit, s = "lambda.min")
## 13 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 38.884372839
## acousticness -0.529384838
## danceability 10.858629158
## energy -1.603181170
## instrumentalness -0.705714089
## key -0.025352048
## liveness 1.580783603
## loudness 0.443180756
## mode .
## speechiness -23.568684213
## tempo 0.002464728
## time_signature 1.311516476
## valence -5.276537779
enet_mod <- cva.glmnet(popularity ~ acousticness + danceability + energy + instrumentalness + key + liveness
+ loudness + mode + speechiness + tempo + time_signature + valence,
data = tracks_train,
alpha = seq(0, 1, by = 0.05))
print(enet_mod)
## Call:
## cva.glmnet.formula(formula = popularity ~ acousticness + danceability +
## energy + instrumentalness + key + liveness + loudness + mode +
## speechiness + tempo + time_signature + valence, data = tracks_train,
## alpha = seq(0, 1, by = 0.05))
##
## Model fitting options:
## Sparse model matrix: FALSE
## Use model.frame: FALSE
## Alpha values: 0 0.05 0.1 0.15 0.2 0.25 0.3 0.35 0.4 0.45 0.5 0.55 0.6 0.65 0.7 0.75 0.8 0.85 0.9 0.95 1
## Number of crossvalidation folds for lambda: 10
minlossplot(enet_mod, cv.type = "min")
# Function to find the best alpha
get_alpha <- function(fit) {
alpha <- fit$alpha
error <- sapply(fit$modlist,
function(mod) {min(mod$cvm)})
alpha[which.min(error)]
}
# Extract the best alpha value
best_alpha <- get_alpha(enet_mod)
print(best_alpha)
## [1] 0.8
# Create best Elastic Net model
best_enet_mod <- cva.glmnet(popularity ~ acousticness + danceability + energy + instrumentalness + key + liveness
+ loudness + mode + speechiness + tempo + time_signature + valence,
data = tracks_train,
alpha = best_alpha)
print(best_enet_mod)
## Call:
## cva.glmnet.formula(formula = popularity ~ acousticness + danceability +
## energy + instrumentalness + key + liveness + loudness + mode +
## speechiness + tempo + time_signature + valence, data = tracks_train,
## alpha = best_alpha)
##
## Model fitting options:
## Sparse model matrix: FALSE
## Use model.frame: FALSE
## Alpha values: 0.8
## Number of crossvalidation folds for lambda: 10